The Quota for Exercise of Parliamentary Activity says that meal expenses can be reimbursed just for the politician, excluding guests and assistants. Creating a feature with information of traveled speed from last meal can help us detect anomalies compared to other expenses.
Since the structured data does not include the time of each expense, we analyze the group of expenses made on the same day.
We merge the reimbursements.xz dataset with companies.xz, so we have latitude/longitude for each expense.
import pandas as pd
import numpy as np
# Load reimbursements, keeping CNPJ/CPF as strings so leading zeros survive.
# NOTE: np.str was deprecated and removed (NumPy >= 1.24); the builtin str
# is the correct dtype to pass here.
reimbursements = pd.read_csv('../data/2016-11-19-reimbursements.xz',
                             dtype={'cnpj_cpf': str},
                             low_memory=False)
reimbursements.iloc[0]
# Keep only meal expenses: the quota covers meals just for the politician.
reimbursements = reimbursements[reimbursements['subquota_description'] == 'Congressperson meal']
reimbursements.shape
# Coerce unparseable dates to NaT instead of raising.
reimbursements['issue_date'] = pd.to_datetime(reimbursements['issue_date'], errors='coerce')
reimbursements.sort_values('issue_date', inplace=True)
companies = pd.read_csv('../data/2016-09-03-companies.xz', low_memory=False)
companies.shape
companies.iloc[0]
# Strip formatting characters (dots, slashes, dashes) from CNPJ so it can be
# matched against the reimbursements' cnpj_cpf column.
# regex=True is required: pandas >= 2.0 treats str.replace patterns as
# literal strings by default, which would leave the CNPJs untouched.
companies['cnpj'] = companies['cnpj'].str.replace(r'[\.\/\-]', '', regex=True)
# Join each meal expense to its supplier company to pick up the geolocation.
# Inner join: expenses whose CNPJ has no match in companies are dropped.
dataset = reimbursements.merge(companies, left_on='cnpj_cpf', right_on='cnpj')
dataset.shape
dataset.iloc[0]
Remove party leaderships from the dataset before calculating the ranking.
# Party-leadership expense rows carry no congressperson_id; drop them.
dataset = dataset.dropna(subset=['congressperson_id'])
dataset.shape
And also remove companies mistakenly geolocated outside of Brazil.
# Bounding box of Brazil; companies geolocated outside it are mistakes.
lat_in_range = dataset['latitude'].gt(-33.742222) & dataset['latitude'].lt(5.2722222)
long_in_range = dataset['longitude'].gt(-73.992222) & dataset['longitude'].lt(-34.7916667)
is_in_brazil = lat_in_range & long_in_range
dataset = dataset[is_in_brazil]
dataset.shape
# keys = ['applicant_id', 'issue_date']
keys = ['congressperson_name', 'issue_date']
# Per congressperson per day: total spent, number of expenses, mean value.
# Named aggregation replaces the dict-of-output-name form, which was
# deprecated and then removed (SpecificationError) in pandas >= 1.0.
aggregation = dataset.groupby(keys)['total_net_value'] \
    .agg(sum='sum', expenses='size', mean='mean')
# np.int was removed from NumPy (>= 1.24); the builtin int is equivalent.
aggregation['expenses'] = aggregation['expenses'].astype(int)
aggregation.sort_values(['expenses', 'sum'], ascending=[False, False]).head(10)
len(aggregation[aggregation['expenses'] > 7])
keys = ['congressperson_name', 'issue_date']
# Per congressperson per day: how many distinct cities, and which ones.
# Named aggregation replaces the dict-renaming form removed in pandas >= 1.0.
cities = dataset.groupby(keys)['city'] \
    .agg(city=lambda x: len(set(x)),
         city_list=lambda x: ','.join(set(x))) \
    .sort_values('city', ascending=False)
cities.head()
cities[cities['city'] >= 4].shape
It would be helpful for our analysis to have a new column containing the distance traveled on a given day.
# vincenty() was removed in geopy 2.0; geodesic() is its replacement and
# uses Karney's algorithm, which always converges (vincenty could fail
# for nearly antipodal points).
from geopy.distance import geodesic as distance
from IPython.display import display
x = dataset.iloc[0]
display(x[['cnpj', 'city', 'state_y']])
# Sanity check: the distance from a point to itself should be zero.
distance(x[['latitude', 'longitude']],
         x[['latitude', 'longitude']])
dataset.shape
# Count how many rows actually have coordinates (some companies lack them).
dataset[['latitude', 'longitude']].dropna().shape
from itertools import tee
def pairwise(iterable):
    """Yield overlapping neighbor pairs: s -> (s0,s1), (s1,s2), (s2,s3), ...

    Works on any iterable, including one-shot generators; yields nothing
    for inputs with fewer than two items.
    """
    iterator = iter(iterable)
    previous = next(iterator, None)
    for current in iterator:
        yield previous, current
        previous = current
def calculate_distances(x):
    """Total km traveled visiting the group's coordinates in row order.

    NaN legs (missing coordinates) are ignored by nansum.
    """
    points = x[['latitude', 'longitude']].values
    legs = [distance(start, end).km for start, end in pairwise(points)]
    return np.nansum(legs)
# Distance traveled per congressperson per day, longest first.
distances = dataset.groupby(keys).apply(calculate_distances)
distances = (distances.reset_index()
                      .rename(columns={0: 'distance_traveled'})
                      .sort_values('distance_traveled', ascending=False))
distances.head()
Right now we are not ordering the list of cities, just calculating the distance between them in the order they appear in the dataset. Since we don't have the time of the expenses to know their real order, one approach is to consider the shortest path between the cities visited on the day by the congressperson.
import networkx as nx
# Toy example: a 5-node path graph to exercise the shortest-path API.
G = nx.path_graph(5)
G
# networkx >= 2.0 returns an iterator of (source, paths) pairs here;
# materialize it into a dict so it can be indexed by node.
path = dict(nx.all_pairs_shortest_path(G))
path
path[0][4]
# Pick one reproducible congressperson-day that spans exactly three cities.
random_congressperson_day = cities[cities['city'] == 3] \
    .sample(random_state=0).reset_index().iloc[0]
matching_keys = ['congressperson_name', 'issue_date']
same_person = dataset['congressperson_name'] == random_congressperson_day['congressperson_name']
same_day = dataset['issue_date'] == random_congressperson_day['issue_date']
expenses_for_graph = dataset[same_person & same_day]
expenses_for_graph
def city_and_state(row):
    """Return a "city - state" display label for one expense row."""
    return f"{row['city']} - {row['state_y']}"
# .assign avoids SettingWithCopyWarning: expenses_for_graph is a slice of
# dataset, so writing a column into it directly is ambiguous for pandas.
expenses_for_graph = expenses_for_graph.assign(
    city_state=expenses_for_graph.apply(city_and_state, axis=1))
expenses_for_graph['city_state']
# Rows of [label, latitude, longitude] for every expense of the chosen day.
lat_longs = expenses_for_graph[['city_state', 'latitude', 'longitude']].values
# np.apply_along_axis(lambda x: (x[0], x[1]), axis=1, arr=lat_longs)
from itertools import combinations
# Every unordered pair of places — the candidate edges for the city graph.
list(combinations(lat_longs.tolist(), 2))
def create_node(row):
    """Add one [label, latitude, longitude] row as a node of cities_graph.

    The node is keyed by its "city - state" label and stores the
    coordinates in the 'pos' attribute.
    """
    print(row[0], row[1], row[2])
    cities_graph.add_node(row[0], pos=(row[1], row[2]))

cities_graph = nx.Graph()
# A plain loop replaces np.apply_along_axis, which is meant for array
# transforms (not side effects) and forced create_node to return a dummy
# scalar (the old `return 42`).
for node_row in lat_longs:
    create_node(node_row)
# Connect every pair of cities, weighting each edge by the distance (km)
# between their coordinates (each item is [label, latitude, longitude]).
edges = list(combinations(lat_longs.tolist(), 2))
for origin, destination in edges:
    leg_km = distance(origin[1:], destination[1:]).km
    print(origin[0], destination[0], leg_km)
    cities_graph.add_edge(origin[0], destination[0], weight=leg_km)
# cities_graph.add_node('starting_point')
# new_edges = [('starting_point', node) for node in cities_graph.nodes()]
# cities_graph.add_edges_from(new_edges, weight=0)
cities_graph.nodes()
cities_graph.edges()
def hamilton(G):
    """Search for a Hamiltonian path in G by depth-first search.

    Returns the path as a list of nodes, or None when no Hamiltonian path
    is found from the chosen start node. Worst case is exponential time.
    """
    # list() is required: networkx >= 2.0 NodeViews are not indexable,
    # so G.nodes()[0] raises TypeError there.
    F = [(G, [list(G.nodes())[0]])]
    n = G.number_of_nodes()
    while F:
        graph, path = F.pop()
        confs = []
        for node in graph.neighbors(path[-1]):
            # Extend the path by one neighbor and remove the visited node
            # from the working copy of the graph.
            conf_p = path[:]
            conf_p.append(node)
            conf_g = nx.Graph(graph)
            conf_g.remove_node(path[-1])
            confs.append((conf_g, conf_p))
        for g, p in confs:
            if len(p) == n:
                return p
            else:
                F.append((g, p))
    return None
hamilton(cities_graph)
# print(lat_longs)
# Upper bound on travel: total distance over every unordered pair of cities.
edges = list(combinations(lat_longs.tolist(), 2))
np.sum([distance(origin[1:], destination[1:]).km for origin, destination in edges])
def calculate_sum_distances(x):
    """Sum of distances (km) between every pair of coordinates in the group.

    Approximates a congressperson-day's travel without knowing the real
    visit order.
    """
    coordinate_list = x[['latitude', 'longitude']].values
    # Each row here is already a (latitude, longitude) pair. The previous
    # `edge[0][1:]` slice — copied from the label-prefixed lat_longs rows —
    # dropped the latitude and fed a single-element point to distance(),
    # producing wrong results.
    return np.sum([distance(pair_a, pair_b).km
                   for pair_a, pair_b in combinations(coordinate_list, 2)])
# Pairwise-sum distance per congressperson per day, largest first.
distances = dataset.groupby(keys).apply(calculate_sum_distances)
distances = (distances.reset_index()
                      .rename(columns={0: 'distance_traveled'})
                      .sort_values('distance_traveled', ascending=False))
distances.head()
# Attach the per-day distance to the per-day expense aggregates.
dataset_with_distances = aggregation.reset_index().merge(distances, on=keys)
dataset_with_distances.sort_values(['distance_traveled', 'expenses'],
                                   ascending=[False, False]).head(10)
from altair import Chart
# Scatter plot of expense count vs. distance traveled per congressperson-day,
# to eyeball anomalous combinations (many expenses over large distances).
Chart(dataset_with_distances).mark_point().encode(
x='expenses',
y='distance_traveled',
)